Library calls

library(tidyverse)
library(plotly)
library(readr)
library(patchwork)

knitr::opts_chunk$set(
    echo = TRUE,
    warning = FALSE,
    fig.width = 8, 
  fig.height = 6,
  out.width = "90%")

theme_set(theme_minimal() + theme(legend.position = "bottom"))

options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis")

scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d

Rat Sightings Data

Load and clean NYC rat sighting data:

rat_df = read_csv("Rat_Sightings.csv") %>%
  janitor::clean_names() %>%
  mutate(created_date = gsub(" .*","", created_date),
         address_type = str_to_title(address_type),
         city = str_to_title(city),
         borough = str_to_title(borough),
         location_type = recode(location_type, "Other (Explain Below)" = "Other")) %>%
  separate(created_date, into = c("month", "day", "year"), sep = "/") %>%
  mutate(year = as.numeric(year),
         month = as.numeric(month),
         day = as.numeric(day)) %>%
  filter(borough != "Unspecified") %>%
  select(month, day, year, location_type, address_type, city, borough, latitude, longitude)
## Rows: 207580 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (25): Created Date, Closed Date, Agency, Agency Name, Complaint Type, De...
## dbl  (6): Unique Key, Incident Zip, X Coordinate (State Plane), Y Coordinate...
## lgl  (7): Vehicle Type, Taxi Company Borough, Taxi Pick Up Location, Bridge ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Asthma Data

Load NYC asthma emergency visit data for adults:

asthma_df = read_csv("asthma_ER_visits_adults.csv") %>%
  janitor::clean_names() %>%
  rename(
    year = time
  ) %>%
   mutate(age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
         estimated_annual_rate_per_10_000 = as.numeric(estimated_annual_rate_per_10_000),
         number = as.numeric(number)) %>%
  filter(geo_type == "Borough") %>%
  group_by(year, geography) %>%
  select(year, geography, age_adjusted_rate_per_10_000, estimated_annual_rate_per_10_000, number)
## Rows: 584 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): GeoType, Geography, Age-adjusted rate per 10,000, Estimated annual ...
## dbl (3): Time, GeoID, GeoRank
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Merge datasets by year and borough:

asthma_rat_df <- left_join(rat_df, asthma_df, by = c('year' = 'year', 'borough' = 'geography'))

Map sightings by coordinate:

Heat map w/density

  rat_df %>%
  plot_ly(
    type = 'densitymapbox',
    lat = ~latitude,
    lon = ~longitude,
    coloraxis = 'coloraxis',
    radius = 10,
    color = ~borough) %>%
  layout(
    mapbox = list(
      style = "stamen-terrain",
      zoom = 9,
      center = list(lon = -73.9, lat = 40.7)), 
    coloraxis = list(colorscale = "Viridis"), 
    title = ("Density Map of Rat Sightings"))

Plot of sightings over time in NYC overall:

overall_rat_line =
  rat_df %>%
  group_by(year) %>%
  count() %>%
  summarise(n_obs = n) %>% 
  ggplot(aes(x = year, y = n_obs)) + 
  scale_x_continuous(breaks = seq(2010, 2022, 2)) +
  geom_line() +
  labs(
    title = "Rat Sightings Over Time in NYC",
    x = "Year",
    y = "Number of Sightings")

overall_rat_line

Plot of sightings over time in NYC by borough:

rat_line = rat_df %>% 
  group_by(borough, year) %>%  
  count() %>%
  summarise(n_obs = n) %>% 
  ggplot(aes(x = year, y = n_obs , color = borough )) + 
  geom_line() +
  scale_x_continuous(breaks = seq(2010, 2022, by = 1)) +
  labs(
    title = "Rat Sightings Over Time by Borough",
    x = "Year",
    y = "Number of Sightings")
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
rat_line

Bar graph of all sightings by borough:

rat_bar = 
  rat_df %>% 
  count(borough) %>% 
  mutate(borough = fct_reorder(borough, n)) %>% 
  ggplot(aes(x = borough, y = n, fill = borough)) + 
  geom_bar(stat = "identity") +
  labs(
    title = "Frequency of Rat Sightings by Borough (2010-2022)",
    x = "Borough",
    y = "Number of Sightings",
    fill = "Borough")
  
  #plot_ly(x = ~borough, y = ~n, color = ~borough, type = "bar", colors = "viridis")

rat_bar

Plot of frequency of sightings by borough and year:

rat_bar_time =
  rat_df %>% 
  group_by(borough) %>%
  count(year) %>% 
  ggplot(aes(x = year, y = n, fill = borough)) +
  geom_bar(stat = "identity",
           position = "dodge") +
  scale_x_continuous(breaks = seq(2010, 2022, by = 1)) +
  labs(
    title = "Frequency of Sightings in Boroughs Over Time",
    x = "Year",
    y = "Number of Sightings",
    fill = "Borough")

rat_bar_time

Location type:

location_bar =
  rat_df %>%
  count(location_type) %>%
   mutate(
    location_type = fct_reorder(location_type, n),
    ranking = min_rank(desc(n))) %>% 
  filter(ranking <= 10) %>% 
  arrange(n) %>%
  ggplot(aes(x = location_type, y = n, fill = location_type)) + 
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Location Types",
       x = "Location Type",
       y = "Number of Sightings",
       fill = "location_type") + coord_flip() +
  theme(legend.position = "none") 

location_bar

location_grid =
  rat_df %>%
  group_by(borough, location_type) %>%
  count() %>%
  summarise(n_obs = n) %>%
   mutate(
    location_type = fct_reorder(location_type, n_obs),
    ranking = min_rank(desc(n_obs))) %>% 
  filter(ranking <= 3) %>% 
  arrange(ranking) %>%
  ggplot(aes(x = location_type, y = n_obs, fill = location_type)) + 
  geom_bar(stat = "identity") +
  facet_grid(. ~ borough) +
  labs(title = "Top Sighting Locations by Borough",
       x = "Location Type",
       y = "Number of Sightings",
       fill = "location_type") +
  theme(axis.text.x = element_blank())
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
  #theme(axis.text.x = element_text(angle = 90, hjust = 1))

location_grid

Rat asthma plot:

asthma_line =
  asthma_rat_df %>%
  filter(year > 2009) %>%
  mutate(age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000, na.rm = TRUE)) %>%
  group_by(year) %>%
  summarize(mean_age_adjust = mean(age_adjusted_rate_per_10_000)) %>%
  drop_na(mean_age_adjust) %>%
  ggplot(aes(x = year, y = mean_age_adjust)) + 
  geom_line() +
  labs(
    title = "Asthma ED Visits Over Time in NYC",
    x = "Year",
    y = "Mean age adjusted rate per 10,000")

asthma_line + overall_rat_line

ASTHMA PLOTS

Asthma cases in nyc over time

asthma_num_df =
  asthma_df %>%
  mutate(
    number = as.numeric(number), 
    data = "Total asthma ER visits"
  ) %>%
  group_by(year, data) %>%
  summarise(n_obs = sum(number, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
asthma_rate_df =
  asthma_df %>%
  mutate(
    age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
    data = "Age adjusted asthma ER visit rate"
  ) %>%
  group_by(year, data) %>%
  summarise(n_obs = mean(age_adjusted_rate_per_10_000, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
asthma_estimated_df =
  asthma_df %>%
  mutate(
    estimated_annual_rate_per_10_000 = as.numeric(estimated_annual_rate_per_10_000), 
    data = "Estimated annual asthma rate"
  ) %>%
  group_by(year, data) %>%
  summarise(n_obs = mean(estimated_annual_rate_per_10_000, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
asthma_years_num_plot =
  asthma_num_df %>%
  ggplot(aes(x = year, y = n_obs)) + 
  geom_line() +
  scale_x_continuous(breaks = seq(2005, 2018, 1)) +
  labs(title = "Total number of asthma ER visits per year",
       x = "Year",
       y = "Total number of asthma ER visits")
  

asthma_rate_estimated_plot = 
  ggplot(data = asthma_rate_df, aes(x = year, y = n_obs, color = data)) + 
  geom_line() +
  geom_line(data = asthma_estimated_df) +
  scale_x_continuous(breaks = seq(2005, 2018, 1)) +
  labs(title = "Age adjusted vs. Estimated Asthma Rates Over The Years",
       x = "Year",
       y = "Rate per 10,000 people")
  

asthma_years_num_plot

asthma_rate_estimated_plot

Mean asthma rate and estimated asthma rate per borough

asthma_rate_borough_plot = 
  asthma_df %>% 
  group_by(geography) %>% 
  mutate(
    age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
    geography = as.factor(geography)
  ) %>%
  summarise(
    asthma_rate = mean(age_adjusted_rate_per_10_000, na.rm = TRUE)
  ) %>%
  mutate(geography = fct_reorder(geography, asthma_rate)) %>% 
  ggplot(aes(x = geography, y = asthma_rate, fill = geography)) + 
  geom_bar(stat = "identity") +
  labs(
    title = "Mean asthma rate per borough (2004-2018)",
    x = "Borough",
    y = "Mean Age-Adjusted Asthma Rate per 10,000",
    fill = "Borough")

asthma_rate_borough_plot

Asthma rate per borough over time

asthma_borough_time_plot = 
  asthma_df %>% 
  group_by(geography, year) %>% 
  mutate(
    age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
    geography = as.factor(geography)
  ) %>%
  summarise(
    asthma_rate = mean(age_adjusted_rate_per_10_000, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = year, y = asthma_rate, fill = geography)) +
  geom_bar(stat = "identity",
           position = "dodge") +
  scale_x_continuous(breaks = seq(2005, 2018, by = 1)) +
  labs(
    title = "Mean Asthma ER Visits in Boroughs Over Time",
    x = "Year",
    y = "Mean Asthma ER Visits per 10,000",
    fill = "Borough")
## `summarise()` has grouped output by 'geography'. You can override using the
## `.groups` argument.
asthma_borough_time_plot

Statistical Analysis

Question: Do asthma rates differ based on rat sightings?

Statistical modeling plan: Run a regression model, such that: outcome variable = asthma rate, predictor variable = rat sightings, control for = year

rat_df2 = rat_df %>%
  group_by(year) %>%
  count() %>%
  summarise(
    n_rat_sightings = n
  )

asthma_df2 = asthma_df %>%
  mutate(
    age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000)
  ) %>%
  group_by(year) %>%
  summarise(
    mean_asthma_rate = mean(age_adjusted_rate_per_10_000, na.rm = TRUE)
  )

asthma_rat_df2 = left_join(rat_df2, asthma_df2, by = 'year')

asthma_rat_mod = lm( mean_asthma_rate ~ n_rat_sightings + year, data = asthma_rat_df2)

asthma_rat_mod %>%
  broom::tidy()
## # A tibble: 3 × 5
##   term               estimate  std.error statistic p.value
##   <chr>                 <dbl>      <dbl>     <dbl>   <dbl>
## 1 (Intercept)     -3425.      4643.         -0.738   0.502
## 2 n_rat_sightings    -0.00425    0.00207    -2.05    0.109
## 3 year                1.78       2.32        0.769   0.485

Check linear model assumptions

# 1. Linear relationship
plot(asthma_rat_mod, 1)

# 2. Normality of residuals
hist(asthma_rat_mod$residuals)

plot(asthma_rat_mod, 2)

# 3. Testing the homoscedasticity assumption
# Plotting residuals
plot(asthma_rat_mod, 3)

# Breusch-Pagan test
lmtest::bptest(asthma_rat_mod)
## 
##  studentized Breusch-Pagan test
## 
## data:  asthma_rat_mod
## BP = 3.6938, df = 2, p-value = 0.1577